PRACTICA 3 - Business Score Analysis Report¶

3. Modelization¶

Ignacio González - Saúl Segura¶

08/01/2023¶

#

Libraries¶

In [166]:
import pandas as pd
import numpy as np
import warnings
import re
import nltk
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_curve,roc_auc_score, \
                            classification_report, confusion_matrix, \
                            precision_recall_curve, precision_score, \
                            f1_score, fbeta_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
import scikitplot as skplt
import shap
import plotly.express as px
from imblearn.over_sampling import SMOTE
from sklearn.inspection import PartialDependenceDisplay


pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

Functions¶

In [69]:
def get_models_score(classifiers, X_train, Y_train):
    
    for classifier in classifiers:

        classifier.fit(X_train, Y_train)  
        y_pred = classifier.predict(X_test)
        print(classifier)
        print("model score: %.3f" % classifier.score(X_test, Y_test))
        print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
        
def get_model_report(Y_test, y_pred):
    
    print("Classification Report")
    print(classification_report(Y_test, y_pred))
    
def get_confusion_matrix(model, X_test, Y_test):
    
    titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]

    for title, normalize in titles_options:
        
        disp = plot_confusion_matrix(model, X_test, Y_test,

                                     cmap=plt.cm.Blues,
                                     normalize=normalize)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

    plt.show()
    
    
def transform_ratings(row):
    
    if row['stars_x'] <= 3:
        return 0
    else:
        return 1
    
def get_roc_curves(ytest, yhat):
    
    
    fpr, tpr, thresholds = roc_curve(ytest, yhat)
    
    plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label='LGBM')
   
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    
    plt.show()
    
    gmeans = np.sqrt(tpr * (1-fpr))
    
    ix = np.argmax(gmeans)
    print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
    
    plt.plot([0,1], [0,1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label='LGBM')
    plt.scatter(fpr[ix], tpr[ix], s=100, marker='o', color='black', label='Best')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

def get_precission_recall_curve(ytest):
    
    no_skill = len(ytest[ytest==1]) / len(ytest)
    plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
    plt.plot(recall, precision, marker='.', label='LGBM')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    
    
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    no_skill = len(ytest[ytest==1]) / len(ytest)
    plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
    plt.plot(recall, precision, marker='.', label='LGBM')
    plt.scatter(recall[ix], precision[ix], s=100, marker='o', color='black', label='Best')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    
def get_pie_chart(df):
    
    fig_1 = px.pie(df,
               names='index',
               height=400,
               width=600,
               hole=0.4,
               title='Nightlife Businesses Stars Distribution',
               values='percent',
               color_discrete_sequence=['#2AA10F','#92E000','#E1FF00','#F58B00','#DE3700']
               )

    fig_1.update_layout(legend=dict(orientation='h', yanchor='bottom', y=-0.2, xanchor='center', x=0.5))
    fig_1.show()
        

Parametrisation¶

In [70]:
nightlife_businesses_path = '../data/Processed/df_nightlife_PA_reviews.csv'
seed = 123456
beta = 2
ind = 6

Warnings¶

In [71]:
warnings.filterwarnings('ignore')

Data¶

In [72]:
nightlife_businesses_PA = pd.read_csv(nightlife_businesses_path)
nightlife_businesses_PA.head()
Out[72]:
review_id business_id stars_x useful funny cool date keywords BusinessAcceptsCreditCards RestaurantsDelivery OutdoorSeating BikeParking RestaurantsPriceRange2 RestaurantsTakeOut ByAppointmentOnly WiFi Alcohol Caters WheelchairAccessible GoodForKids RestaurantsAttire RestaurantsReservations CoatCheck DogsAllowed RestaurantsTableService RestaurantsGoodForGroups HasTV HappyHour DriveThru NoiseLevel BusinessAcceptsBitcoin AcceptsInsurance Smoking GoodForDancing BYOB Corkage BYOBCorkage HairSpecializesIn Open24Hours RestaurantsCounterService AgesAllowed DietaryRestrictions Monday Tuesday Wednesday Thursday Friday Saturday Sunday name postal_code latitude longitude stars_y review_count
0 oyaMhzBSwfGgemSGuZCdwQ YtSqYv1Q_pOltsVPSx54SA 5.0 0.0 0.0 0.0 2013-06-24 11:21:25 ['Tremendous', 'service', 'Big', 'shout', 'Dou... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
1 iOQ_bnKI5HfPbH43DMAw6w YtSqYv1Q_pOltsVPSx54SA 3.0 0.0 0.0 0.0 2013-01-27 19:22:26 ['good', 'place', 'lofty', 'prices', 'proporti... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
2 rzrBiijeQh7ubjfRCr-UtA YtSqYv1Q_pOltsVPSx54SA 4.0 12.0 11.0 11.0 2008-04-30 15:26:12 ['bar', 'area', 'upscale', 'cities', 'restaura... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
3 1HP3yZN3jT646IlHSo7GZw YtSqYv1Q_pOltsVPSx54SA 5.0 0.0 0.0 0.0 2014-06-11 16:10:04 ['prime', 'rib', 'steak', 'joints', 'experience'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
4 Vv6acqoztdtzTD8Gq0gifA YtSqYv1Q_pOltsVPSx54SA 5.0 0.0 0.0 0.0 2018-03-04 00:43:27 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290

We have imported the nightlife businesses PA dataset , which contains all the data related to the nightlife businesses in the state of Pennsylvania.

According to the use case that we have proposed, which is, predicting the rating of a review based on the comments of the review itself, we have decided to extract the most important keywords from the text to eliminate the noise it may have. In this way, we believe that we will facilitate the work to the model in order to predict the rating.

We believe that predicting the rating of a business would be counterproductive, since this rating is calculated based on the average of all the reviews of the business. Therefore, it would not make sense to use the rating of the business as the objective variable, if not the rating of the review. This would be a more realistic goal. Therefore, for the model, as the objective variable we will use the variable stars_x that collects the ratings of the reviews.

In [73]:
print("Total No. of Reviews: {}".format(nightlife_businesses_PA.shape[0]))
Total No. of Reviews: 261678

We have a total of 261678 reviews.

In [74]:
nightlife_businesses_PA_stars = nightlife_businesses_PA["stars_x"].value_counts(normalize=True).mul(100).rename('percent').reset_index()
nightlife_businesses_PA_count = nightlife_businesses_PA["stars_x"].value_counts().reset_index()
nightlife_businesses_PA_count_pct = pd.merge(nightlife_businesses_PA_stars, nightlife_businesses_PA_count, on=['index'], how='inner')
nightlife_businesses_PA_count_pct
Out[74]:
index percent stars_x
0 5.0 37.475065 98064
1 4.0 28.313806 74091
2 3.0 13.955701 36519
3 1.0 10.879019 28468
4 2.0 9.376409 24536
In [75]:
get_pie_chart(nightlife_businesses_PA_count_pct)
In [76]:
nightlife_businesses_PA["stars_x"].value_counts()
Out[76]:
5.0    98064
4.0    74091
3.0    36519
1.0    28468
2.0    24536
Name: stars_x, dtype: int64

We have a clear predominance of reviews with a rating between 4 and 5, which would represent a good comment from the user.

To facilitate the work of the model, we are going to convert this variable to an integer type since the model only accepts values of this type.

In [77]:
nightlife_businesses_PA['stars_x'] = nightlife_businesses_PA['stars_x'].round().astype(int)
nightlife_businesses_PA.head(50)
Out[77]:
review_id business_id stars_x useful funny cool date keywords BusinessAcceptsCreditCards RestaurantsDelivery OutdoorSeating BikeParking RestaurantsPriceRange2 RestaurantsTakeOut ByAppointmentOnly WiFi Alcohol Caters WheelchairAccessible GoodForKids RestaurantsAttire RestaurantsReservations CoatCheck DogsAllowed RestaurantsTableService RestaurantsGoodForGroups HasTV HappyHour DriveThru NoiseLevel BusinessAcceptsBitcoin AcceptsInsurance Smoking GoodForDancing BYOB Corkage BYOBCorkage HairSpecializesIn Open24Hours RestaurantsCounterService AgesAllowed DietaryRestrictions Monday Tuesday Wednesday Thursday Friday Saturday Sunday name postal_code latitude longitude stars_y review_count
0 oyaMhzBSwfGgemSGuZCdwQ YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2013-06-24 11:21:25 ['Tremendous', 'service', 'Big', 'shout', 'Dou... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
1 iOQ_bnKI5HfPbH43DMAw6w YtSqYv1Q_pOltsVPSx54SA 3 0.0 0.0 0.0 2013-01-27 19:22:26 ['good', 'place', 'lofty', 'prices', 'proporti... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
2 rzrBiijeQh7ubjfRCr-UtA YtSqYv1Q_pOltsVPSx54SA 4 12.0 11.0 11.0 2008-04-30 15:26:12 ['bar', 'area', 'upscale', 'cities', 'restaura... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
3 1HP3yZN3jT646IlHSo7GZw YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2014-06-11 16:10:04 ['prime', 'rib', 'steak', 'joints', 'experience'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
4 Vv6acqoztdtzTD8Gq0gifA YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2018-03-04 00:43:27 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
5 8CF6b3nrnAohDawiKv87TA YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2016-02-23 19:40:10 ['Prime', 'Rib', 'offer', 'cut'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
6 E6qHEOzFhGiYAn5cgzbZkQ YtSqYv1Q_pOltsVPSx54SA 2 3.0 0.0 0.0 2010-07-07 21:29:29 ['nothing', 'thats', 'food'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
7 vftJTlt7KVMtCRbcBZNSwg YtSqYv1Q_pOltsVPSx54SA 5 1.0 0.0 0.0 2014-12-18 20:47:25 ['time', 'cut', 'wine', 'full', 'boyfriend'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
8 wdcjv9W7RodJofnvBzK6FQ YtSqYv1Q_pOltsVPSx54SA 5 1.0 0.0 0.0 2014-09-10 13:25:46 ['friend', 'food', 'best', 'Phila', 'night'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
9 nd29xztgxcH1cV0Srf9WAw YtSqYv1Q_pOltsVPSx54SA 1 2.0 3.0 0.0 2016-10-02 00:32:34 ['place', 'negative', 'stars'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
10 IY7i04LMohgO7HIk5pYvew YtSqYv1Q_pOltsVPSx54SA 5 2.0 2.0 2.0 2005-09-23 13:05:56 ['fact', 'part', 'small'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
11 r0Rg8B4XDsBf8KomsPijmw YtSqYv1Q_pOltsVPSx54SA 5 2.0 1.0 0.0 2013-01-28 02:15:10 ['restaurant', 'week', 'place', 'years', 'times'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
12 RWSb10J0em22lTpFhZ5QZQ YtSqYv1Q_pOltsVPSx54SA 3 0.0 0.0 0.0 2017-11-07 14:03:03 ['crab', 'food', 'fine', 'cakes'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
13 A50zoocWps2aiZzbDCk6Iw YtSqYv1Q_pOltsVPSx54SA 2 5.0 2.0 4.0 2009-08-07 15:17:30 ['good', 'fancy', 'half', 'cut'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
14 UBSnRKNmhyKdx-PTn8abUA YtSqYv1Q_pOltsVPSx54SA 2 1.0 0.0 0.0 2013-08-16 04:34:42 ['way', 'food', 'date', 'place', 'server', 'me... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
15 D599DF6m2Z6XC7R1e4xHkg YtSqYv1Q_pOltsVPSx54SA 5 1.0 1.0 0.0 2017-02-20 03:56:36 ['salad', 'prime', 'rib', 'delicious'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
16 q1bg-VKtl0YHpqwOYfbTzg YtSqYv1Q_pOltsVPSx54SA 4 1.0 0.0 0.0 2016-01-20 02:08:26 ['great', 'prime', 'rib', 'month', 'chocolate'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
17 mo3pqyAumP4BG4Nz48FiwA YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2016-06-25 12:10:26 ['BAR', 'expensive', 'worth', 'account'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
18 jZizwcu-gH9MYrLpXOcauw YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2017-12-30 22:53:46 ['fantastic', 'wonderful', 'style', 'steakhous... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
19 UG8HGpvSl8GWlZnHAUHZPg YtSqYv1Q_pOltsVPSx54SA 5 0.0 2.0 1.0 2014-05-28 12:21:15 ['food', 'kids'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
20 GmqFBtJkxbh1UjTDPnBJJQ YtSqYv1Q_pOltsVPSx54SA 2 0.0 0.0 0.0 2017-12-04 14:22:57 ['nice', 'service', 'time', 'Philly', 'OK'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
21 BqtVaxlau4qV8cqtEFDd4w YtSqYv1Q_pOltsVPSx54SA 3 2.0 2.0 2.0 2009-10-26 04:41:23 ['steak', 'please', 'medium', 'rare', 'LOVE'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
22 RvZHRve8_DHD957g2Mlpwg YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2012-04-01 20:25:52 ['good', 'business', 'review', 'old', 'money',... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
23 cvExhHIpO4fXilY9hGi5Kw YtSqYv1Q_pOltsVPSx54SA 1 0.0 0.0 0.0 2015-09-02 01:04:11 ['general', 'manager', 'disrespectful', 'food'... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
24 rFEvx_pu0Y0JtEZuzBzTQA YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2014-09-29 13:48:42 ['tender', 'good', 'place', 'look'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
25 XdP_JZI9jESTPqW8YBWf-Q YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2012-04-04 10:02:09 ['trip', 'Prime', 'Rib', 'meal', 'selection', ... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
26 I6nS0j-m1ifQHWq4C2mzLg YtSqYv1Q_pOltsVPSx54SA 5 4.0 2.0 3.0 2011-01-18 20:27:05 ['steak', 'prime', 'rib', 'Philly', 'good'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
27 I1IKWTGEZF1vxFNo_G8f5A YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2015-05-02 02:13:32 ['Best', 'Prime', 'rib', 'Everything', 'Love',... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
28 NW8H4MU1uz-xx18CZJKy7A YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2012-09-16 22:34:05 ['nights', 'favorite', 'prime', 'rib', 'potato'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
29 -5Ej9ImJ51-WCmX4fyvETA YtSqYv1Q_pOltsVPSx54SA 2 0.0 3.0 0.0 2013-05-31 12:20:34 ['full', 'old', 'men', 'Tony', 'Soprano', 'liv... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
30 OUWmGzDpvFbW3MdZkVeDNQ YtSqYv1Q_pOltsVPSx54SA 3 0.0 0.0 0.0 2013-08-05 12:54:01 ['good', 'wearing', 'waiter', 'table', 'captai... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
31 mQ0YpErPdsHTd2KXPBhF2Q YtSqYv1Q_pOltsVPSx54SA 1 0.0 0.0 0.0 2016-03-19 01:10:23 ['mice', 'scurry', 'floor', 'server'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
32 W0rfL71LsLlV7Jl6SEoR6Q YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2011-10-07 13:32:08 ['food', 'Antonio'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
33 ZdIu2b27brSWFqG_pJDLeA YtSqYv1Q_pOltsVPSx54SA 5 4.0 1.0 2.0 2014-04-03 15:59:19 ['wife', 'rehearsal', 'dinner', 'wedding'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
34 3gFGEmRwa9cjgmcgyU8TlQ YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2010-11-27 04:26:33 ['pleased', 'table', 'shrimp', 'salad', 'dinner'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
35 B3bB1EVtODLW-15qBGgKFA YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2010-05-17 14:58:28 ['great', 'family', 'Prime', 'Rib', 'meal', 'E... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
36 -nxuCi0p3qv1s8DkX2kINg YtSqYv1Q_pOltsVPSx54SA 3 0.0 0.0 0.0 2016-06-01 22:28:06 ['good', 'Prime', 'Rib', 'era', 'service', 'Sa... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
37 X2CTpfyhB6S7WnDDpPnOdg YtSqYv1Q_pOltsVPSx54SA 3 1.0 0.0 0.0 2016-01-21 00:06:56 ['service', 'bad', 'lamb', 'rack', 'good'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
38 9JpxZpcDPTGqIEBoz0IUBg YtSqYv1Q_pOltsVPSx54SA 4 2.0 1.0 0.0 2015-02-05 15:12:55 ['place', 'service', 'Juicy', 'prime', 'rib'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
39 p2SBD6gyw3djvrSh_Daz4Q YtSqYv1Q_pOltsVPSx54SA 4 3.0 1.0 4.0 2008-09-18 18:35:40 ['Prime', 'Rib', 'Restaurant', 'Week'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
40 QL8e1L1Vf1KDwoG2khGNGQ YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2018-04-29 12:19:46 ['restaurant', 'place', 'Saturday', 'evening'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
41 5NrWXIeE_7LpF4moSpxtAA YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2011-08-26 00:03:01 ['Warwick', 'restaurant', 'nights', 'dinner'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
42 a90idd_P1oNf103f-nCzWg YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 0.0 2012-02-05 13:16:55 ['steak', 'good', 'see', 'quality', 'sides', '... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
43 qCvfeguEDTt1rCHlXp1BdA YtSqYv1Q_pOltsVPSx54SA 5 3.0 0.0 0.0 2014-09-27 21:25:32 ['service', 'bar', 'love', 'Prime', 'Rib', 'El... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
44 jb_ZS62r2RO7wwFkPXTzjQ YtSqYv1Q_pOltsVPSx54SA 4 1.0 0.0 0.0 2010-11-28 16:50:51 ['good', 'Make', 'bar', 'start'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
45 KnH0ZhP3hlqoGftUwR4eaA YtSqYv1Q_pOltsVPSx54SA 5 1.0 0.0 0.0 2009-11-09 16:46:15 ['Prime', 'Rib', 'sort', 'food'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
46 ZhAKHsnlX2r_vrWFNJYZZw YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2012-05-11 22:52:33 ['Prime', 'Rib', 'DP', 'special'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
47 Fw-h3iTbzNC2BpWWQ6arFA YtSqYv1Q_pOltsVPSx54SA 4 0.0 0.0 1.0 2013-12-10 01:49:09 ['place', 'dinner', 'room', 'eat', 'fancy', 'c... True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
48 HVBOJN5lKvZf3LrjcxTY-A YtSqYv1Q_pOltsVPSx54SA 4 3.0 0.0 0.0 2016-09-19 18:47:12 ['night', 'servers', 'Vinny', 'city'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
49 sY6oP1czbvflB2WkLyIZ7g YtSqYv1Q_pOltsVPSx54SA 5 0.0 0.0 0.0 2018-06-11 04:49:55 ['Love', 'feel'] True True False True 3.0 True NaN 'free' 'full_bar' False True False u'dressy' True True NaN NaN True True True NaN u'average' NaN NaN NaN False NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 16:30-22:0 Rittenhouse Grill 19103 39.948949 -75.169532 3.5 290
In [78]:
nightlife_businesses_PA["stars_x"].value_counts()
Out[78]:
5    98064
4    74091
3    36519
1    28468
2    24536
Name: stars_x, dtype: int64

As we can see, the model is quite unbalanced since we found a clear difference between the two majority classes (rating 4 and 5) compared to the rest of the classes. We will see how it affects the model and based on the results we will consider using a sampling technique.

In [79]:
stars_ratings = [nightlife_businesses_PA['stars_x'] for review in nightlife_businesses_PA]
stars_ratings
Out[79]:
[0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64,
 0         5
 1         3
 2         4
 3         5
 4         5
          ..
 261673    2
 261674    5
 261675    5
 261676    5
 261677    5
 Name: stars_x, Length: 261678, dtype: int64]

Bearing in mind that we are making a sentiment analysis model, we will pass the variable stars_x to the model as the objective variable and the keywords to train the model.

So let's save these two variables.

In [80]:
keywords = nightlife_businesses_PA['keywords'] 
keywords
Out[80]:
0         ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1         ['good', 'place', 'lofty', 'prices', 'proporti...
2         ['bar', 'area', 'upscale', 'cities', 'restaura...
3         ['prime', 'rib', 'steak', 'joints', 'experience']
4         ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
                                ...                        
261673    ['family', 'special', 'occasion', 'help', 'dis...
261674                                     ['open', 'spot']
261675    ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676                    ['good', 'Lark', 'anyone', 'get']
261677    ['food', 'perfect', 'rare', 'restaurant', 'goo...
Name: keywords, Length: 261678, dtype: object
In [81]:
# This vectorizer breaks text into single words and bi-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,1))

# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each indivdual text into
# a matrix of numbers.
vectors = vectorizer.fit_transform(keywords)

The TfidfVectorizer is a tool for preprocessing and transforming text data. It converts a collection of text documents to a matrix of numerical features by implementing the following steps:

Tokenization: The vectorizer first breaks down the text into individual words or tokens, in our case we have previously splitted the text into individual keywords.

N-grams: Next, the vectorizer constructs a vocabulary of all the unique n-grams (a contiguous sequence of n items from a given sample of text or speech) in the text data. As we have individual keywords or unigrams, which are, single words, we need to pass to the model a ngram range of (1,1). In the event that we had an entire text, we would have to pass it a range of (1,3).

The fit_transform method fits the vectorizer to the text data and then transforms it into a numerical feature matrix where keywords have a different weight based on the importance of the keyword.

In [82]:
vectors
Out[82]:
<261678x24548 sparse matrix of type '<class 'numpy.float64'>'
	with 1335280 stored elements in Compressed Sparse Row format>

We split into train and test with a proportion of 80-20.

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(vectors, stars_ratings[1], test_size=0.20, random_state=seed, 
                                                    shuffle =False,)

We have identified, based on previous research that we have conducted, that the following models perform quite well in this scenario.

In [12]:
classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    LinearSVC()
    ]

get_models_score(classifiers, X_train, Y_train)
LogisticRegression()
model score: 0.485
Accuracy: 0.48492433506572913
MultinomialNB()
model score: 0.463
Accuracy: 0.4634095077957811
RandomForestClassifier()
model score: 0.465
Accuracy: 0.4652247019260165
LinearSVC()
model score: 0.473
Accuracy: 0.47330709263222254

The logistic regression model gives us the best results, let´s see the confusion matrix.

In [84]:
LR = LogisticRegression()
In [85]:
LR = LogisticRegression(penalty='l2', C=1.0, random_state=seed, n_jobs=2)

LR.fit(X_train, Y_train)
y_pred = LR.predict(X_test)

print("model score: %.3f" % LR.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
<frozen importlib._bootstrap>:228: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject
model score: 0.485
Accuracy: 0.48492433506572913
/Users/ignaciogonzalez/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [86]:
get_model_report(Y_test, y_pred)
Classification Report
              precision    recall  f1-score   support

           1       0.56      0.52      0.54      6114
           2       0.31      0.12      0.17      4795
           3       0.35      0.14      0.20      7198
           4       0.39      0.45      0.42     14270
           5       0.56      0.71      0.62     19959

    accuracy                           0.48     52336
   macro avg       0.43      0.39      0.39     52336
weighted avg       0.46      0.48      0.46     52336

As we have been anticipating, the results could be much better. However, we consider that the results obtained are not entirely bad since as the objective variable we have 5 categories or ratings. Therefore, we consider that it can be way more difficult to predict between 5 categories instead of 2.

In [87]:
get_confusion_matrix(LR, X_test, Y_test)
Confusion matrix, without normalization
[[ 3159   448   327   829  1351]
 [ 1031   552   594  1318  1300]
 [  542   374  1025  3080  2177]
 [  412   198   682  6426  6552]
 [  462   189   338  4753 14217]]
Normalized confusion matrix
[[0.51668302 0.07327445 0.05348381 0.13559045 0.22096827]
 [0.21501564 0.11511992 0.12387904 0.27486966 0.27111575]
 [0.07529869 0.05195888 0.14240067 0.42789664 0.30244512]
 [0.02887176 0.01387526 0.04779257 0.45031535 0.45914506]
 [0.02314745 0.00946941 0.01693472 0.23813818 0.71231024]]

As a conclusion, we can deduce that the fact that the objective variable is so unbalanced, it negatively affects the results of the model. This is because, although the model in general does not accurately predict all categories (1-5) we found that it misses a lot in categories 1-3, compared to 4-5. This is due, again, to the fact that it is quite unbalanced.

Therefore, we have decided to create another model in which we pass the ratings variable as the objective variable, but in this case we will pass two categories:

0 for ratings from 1-3 as if it were a bad experience or review

1 for ratings 4 and 5 as if it were a good experience.

Second Model¶

In [88]:
columns_to_keep = ['stars_x', 'keywords']
df_second_model = nightlife_businesses_PA[columns_to_keep]
df_second_model
Out[88]:
stars_x keywords
0 5 ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1 3 ['good', 'place', 'lofty', 'prices', 'proporti...
2 4 ['bar', 'area', 'upscale', 'cities', 'restaura...
3 5 ['prime', 'rib', 'steak', 'joints', 'experience']
4 5 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
... ... ...
261673 2 ['family', 'special', 'occasion', 'help', 'dis...
261674 5 ['open', 'spot']
261675 5 ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676 5 ['good', 'Lark', 'anyone', 'get']
261677 5 ['food', 'perfect', 'rare', 'restaurant', 'goo...

261678 rows × 2 columns

In [142]:
nan_count = np.count_nonzero(df_second_model.isna())
print(nan_count)
0

As we can see the model doesnt include any nulls, thats why we decided before to not treat them until we have the use case and df prepared.

In [89]:
df_second_model['stars_x'] = df_second_model.apply(transform_ratings, axis=1)

We have created the transform_ratings function to replace de categories 1-5 for 0 and 1 based on the previous assumption.

In [90]:
df_second_model
Out[90]:
stars_x keywords
0 1 ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1 0 ['good', 'place', 'lofty', 'prices', 'proporti...
2 1 ['bar', 'area', 'upscale', 'cities', 'restaura...
3 1 ['prime', 'rib', 'steak', 'joints', 'experience']
4 1 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
... ... ...
261673 0 ['family', 'special', 'occasion', 'help', 'dis...
261674 1 ['open', 'spot']
261675 1 ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676 1 ['good', 'Lark', 'anyone', 'get']
261677 1 ['food', 'perfect', 'rare', 'restaurant', 'goo...

261678 rows × 2 columns

In [91]:
df_second_model["stars_x"].value_counts()
Out[91]:
1    172155
0     89523
Name: stars_x, dtype: int64

We still have a quite unbalanced variable. Still we will test the model and observe the results. Therefore we create the two variables storing the ratings and keywords that will be passed to the models.

In [92]:
stars_ratings = [df_second_model['stars_x'] for review in df_second_model]
stars_ratings
Out[92]:
[0         1
 1         0
 2         1
 3         1
 4         1
          ..
 261673    0
 261674    1
 261675    1
 261676    1
 261677    1
 Name: stars_x, Length: 261678, dtype: int64,
 0         1
 1         0
 2         1
 3         1
 4         1
          ..
 261673    0
 261674    1
 261675    1
 261676    1
 261677    1
 Name: stars_x, Length: 261678, dtype: int64]
In [93]:
keywords = df_second_model['keywords'] 
keywords
Out[93]:
0         ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1         ['good', 'place', 'lofty', 'prices', 'proporti...
2         ['bar', 'area', 'upscale', 'cities', 'restaura...
3         ['prime', 'rib', 'steak', 'joints', 'experience']
4         ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
                                ...                        
261673    ['family', 'special', 'occasion', 'help', 'dis...
261674                                     ['open', 'spot']
261675    ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676                    ['good', 'Lark', 'anyone', 'get']
261677    ['food', 'perfect', 'rare', 'restaurant', 'goo...
Name: keywords, Length: 261678, dtype: object
In [94]:
# This vectorizer breaks text into single words and uni-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,1))

# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each indivdual text into
# a matrix of numbers.
vectors = vectorizer.fit_transform(keywords)
In [95]:
X_train, X_test, Y_train, Y_test = train_test_split(vectors, stars_ratings[1], test_size=0.20, random_state=seed, 
                                                     stratify= stars_ratings[1])
In [45]:
classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    LinearSVC()
    ]

get_models_score(classifiers, X_train, Y_train)
LogisticRegression()
model score: 0.773
Accuracy: 0.7730243044940385
MultinomialNB()
model score: 0.755
Accuracy: 0.7547386120452461
RandomForestClassifier()
model score: 0.757
Accuracy: 0.7571079180678691
LinearSVC()
model score: 0.765
Accuracy: 0.7648845918679303

We have obtained much better results. Obviously the model behaves much better with fewer categories than with more.

We choose logistic regression to observe the confusion matrix and the curves.

In [96]:
LR = LogisticRegression()
In [97]:
LR = LogisticRegression(penalty='l2', C=1.0, random_state=seed, n_jobs=2)

LR.fit(X_train, Y_train)
y_pred = LR.predict(X_test)

print("model score: %.3f" % LR.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
<frozen importlib._bootstrap>:228: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject
model score: 0.774
Accuracy: 0.7735402017731581
/Users/ignaciogonzalez/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [98]:
get_model_report(Y_test, y_pred)
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.53      0.62     17905
           1       0.79      0.90      0.84     34431

    accuracy                           0.77     52336
   macro avg       0.76      0.72      0.73     52336
weighted avg       0.77      0.77      0.76     52336

In general we get better results compared to the previous model. Let´s see the confusion matrix

In [99]:
get_confusion_matrix(LR, X_test, Y_test)
Confusion matrix, without normalization
[[ 9493  8412]
 [ 3440 30991]]
Normalized confusion matrix
[[0.5301871  0.4698129 ]
 [0.09990996 0.90009004]]

The results are not the best, however they are much better compared to the previous model.

We have a high probability of predicting good reviews based on keywords and a 50% probability of predicting a bad review. This is because the variable is unbalanced.

In the worst case scenario, which would be predicted as a good review but is actually bad, we still have a 50% chance. The model could be better however we are happy with the results.

The conclusion we can draw is that the imbalance of the dataset negatively affects the prediction of category 0

In [100]:
prob_predictions = LR.predict_proba(X_test)
In [101]:
yhat = prob_predictions[:, 1]

ROC CURVE¶

In [102]:
get_roc_curves(Y_test, yhat)
Best Threshold=0.660704, G-Mean=0.743

As we can see in the graph, the roc curve falls under the area at the top-left corner which indicates good performance levels. This also indicates how well the classifier distinguishes between positive and negative cases instead of falling to the right bottom corner which will lead to a bad performance model which will lead to a highly rate in predicting false positives. The second graph indicates the optimal threshold for this model which is 0.63%.

A G-mean of 0.743 is generally considered to be a good score, indicating that the classifier has a good balance between TPR and TNR.

Precision-Recall Curve¶

In [103]:
precision, recall, thresholds = precision_recall_curve(Y_test, yhat)
In [104]:
get_precission_recall_curve(Y_test)
Best Threshold=0.435148, F-Score=0.843

In this case we got a nearly high AUC-PR score,which is the are under the precission recall curve. This curve, as the ROC curve gives us the weighted mean of precision achieved at each threshold. The optimal threshold given by this curve hints us that we are following a correct path as it gives as the same optimal threshold.

In [105]:
score = f1_score(Y_test, y_pred)
print('F-Score: %.5f' % score)
F-Score: 0.83948

The F1 score is a measure of a test's accuracy. It is calculated as the harmonic mean of the precision and recall of the test, with a higher score indicating a better balance between precision and recall.

An F1 score of 0.84 is generally considered to be a good score. It indicates that the classifier has a good balance between precision and recall.

Cumulative gains curve¶

In [106]:
skplt.metrics.plot_cumulative_gain(Y_test, prob_predictions)
plt.show()

According to the cumulative curve, if we approach 20% of our transaction base (x-axis), we will get over 30% of the ratings of category 1 predicted and nearly 50% of the ratings of the category 0. With 40% of the sample we will get 55% of the ratings of category 1 predicted and nearly 80% of category 0.

Lift curve¶

In [107]:
skplt.metrics.plot_lift_curve(Y_test, prob_predictions)
plt.show()

This curve informs us on how much better our model predicts than randomly guessing. For example, using the 20% of our best predictions, our model is about 1.5 and 2,25 times better(for cateogry 1 and 0 respectively) than randomly selecting 10% from our transaction pool.

In [108]:
fbeta_score(Y_test, y_pred, beta=beta)
Out[108]:
0.8748242786249415

The F2 score is a measure of a test's accuracy that weighs recall higher than precision. 0,87 is a good score, with a good balance between precision and recall giving more weight to recall.

We established beta value as 2 to make recall value more important than a Precision one. It focuses on minimizing False Negatives than minimizing False Positives which it is what interest us. In this case, as we are correctly classifing samples(in the majority of the cases for this model),we are getting higher Precision and Recall scores which will give us a higher F-measure value.

Oversampled Model¶

Although we have obtained good results with the model, we have observed that the model heavily penalizes bad experiences or reviews. In other words, since it is quite unbalanced, the model has trouble predicting category 0.

Therefore, we are going to use an oversampling technique to try to get better results.

In [131]:
oversampler = RandomOverSampler(sampling_strategy='minority')
In [132]:
X_train, X_test, Y_train, Y_test = train_test_split(vectors, stars_ratings[1], test_size=0.20, random_state=seed, 
                                                    stratify= stars_ratings[1])
In [133]:
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, Y_train)
In [134]:
y_train_oversampled
Out[134]:
0         1
1         0
2         1
3         1
4         0
         ..
275443    0
275444    0
275445    0
275446    0
275447    0
Name: stars_x, Length: 275448, dtype: int64
In [64]:
classifiers = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier(),
    LinearSVC()
    ]

get_models_score(classifiers, X_train_oversampled, y_train_oversampled)
LogisticRegression()
model score: 0.734
Accuracy: 0.7344084377866096
MultinomialNB()
model score: 0.733
Accuracy: 0.73318557016203
RandomForestClassifier()
model score: 0.745
Accuracy: 0.7446690614490982
LinearSVC()
model score: 0.727
Accuracy: 0.727128553959034

As far as we can see, we have not obtained better results in terms of model score and accuracy, Let´s see the confusion matrix

In [113]:
clf = RandomForestClassifier()

clf.fit(X_train_oversampled, y_train_oversampled)
y_pred = clf.predict(X_test)

print("model score: %.3f" % clf.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
model score: 0.748
Accuracy: 0.7481848058697645
In [135]:
get_model_report(Y_test, y_pred)
Classification Report
              precision    recall  f1-score   support

           0       0.64      0.60      0.62     17905
           1       0.80      0.83      0.81     34431

    accuracy                           0.75     52336
   macro avg       0.72      0.71      0.72     52336
weighted avg       0.74      0.75      0.75     52336

The overall results in precision recall and f1 seem to be a bit worse, however let´s see the confusion matrix as we consider it is one of the most important metrics to observe.

In [136]:
get_confusion_matrix(clf, X_test, Y_test)
Confusion matrix, without normalization
[[10672  7233]
 [ 5946 28485]]
Normalized confusion matrix
[[0.59603463 0.40396537]
 [0.17269321 0.82730679]]

As we can see, the oversampled model seems to predict category 1 worse but category 0 a little better. However, in the case that we had to choose one, taking into account the worst case scenario (which would be predicting a review as good but it is actually bad) you would be getting better results. Therefore, in this aspect the model behaves better since in the unbalanced model we would be failing in 8412 cases compared to 7233. The model would then be 7% better predicting the worst scenario. It is for this reason that if we had to choose a model we would consider staying with this one.

In [155]:
prob_predictions = clf.predict_proba(X_test)
In [156]:
yhat = prob_predictions[:, 1]

 ROC CURVE¶

In [157]:
get_roc_curves(Y_test, yhat)
Best Threshold=0.610588, G-Mean=0.722

We got almost the same results compared to the previous model. We concluded that both models are very closed however the second model tends to perform better in the worst scenario.

Precision-Recall Curve¶

In [158]:
precision, recall, thresholds = precision_recall_curve(Y_test, yhat)
In [159]:
get_precission_recall_curve(Y_test)
Best Threshold=0.350190, F-Score=0.830
In [160]:
score = f1_score(Y_test, y_pred)
print('F-Score: %.5f' % score)
F-Score: 0.81213

Cumulative Gains Curve¶

In [161]:
skplt.metrics.plot_cumulative_gain(Y_test, prob_predictions)
plt.show()

According to the cumulative gains curve, the balanced model is a bit worse as with a 20% of the samplle we will be explaining almost 30% of category 1 and 43% of cateogry 0 compared to the nearly 50% that we where explaining from the category 0 in the previous model.

Lift Curve¶

In [162]:
skplt.metrics.plot_lift_curve(Y_test, prob_predictions)
plt.show()

We can also see reflected in this curve what we have been commenting on the previous curve analysis. With a 20% of the sample the balanced model is a bit worse predicting than randomly guessing.

In [163]:
fbeta_score(Y_test, y_pred, beta=beta)
Out[163]:
0.821167883211679

According to the f1 and f2 scores the model is also worse. Therefore we have decided, even though that the model predicts a bit better in the worst scenario, to chose the unbalanced model for the interpretability section.

INTERPRETABILITY¶

Unbalanced Model¶

In [32]:
X_test
Out[32]:
<52336x24548 sparse matrix of type '<class 'numpy.float64'>'
	with 267462 stored elements in Compressed Sparse Row format>
In [33]:
feature_names = vectorizer.get_feature_names_out()
In [34]:
explainer = shap.Explainer(LR, X_train)
shap_values = explainer.shap_values(X_test)
In [35]:
colour_test = pd.DataFrame(X_test.todense())

Global Interpretability¶

In [36]:
shap.summary_plot(shap_values, colour_test, feature_names=feature_names)

The summary plot above shows the top 20 features based on their feature importance for the predictions. The SHAP value on the x-axis shows whether the feature effected a higher or lower prediction probability. Each dot represents a different test observation and the colour of the dot is how important that feature was for that particular prediction.

As expected, many disaster-related words such as “great”, “delicous”, “amazing”, “love”, "excellent" and more are present in the top 20 feature set.

From this graph we can interpret that a high value in words like "great", "delicious", "excellent", for example leads us to categorize a review as a "good" experience, that is, to fall on the 1 category that would correspond to the review ratings with 4 or 5 stars.

On the other hand, words like "table", "minutes", "hour", "ok" leads us to categorize a review as a "bad" experience, that is, to fall on the zero category that would correspond to the review ratings with 1, 2 or 3 stars.

Local Interpretability¶

In [43]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[5,:], 
                colour_test.iloc[5,:], feature_names=feature_names)
Out[43]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

In the force plot above, we have an example of a review prediction where the output value (i.e. the prediction for this observation) was 4.22. The base value is the predicted value if we did not have any knowledge of the features (it is the mean prediction value). Higher scores lead the model to predict 1 and lower scores lead the model to predict 0. So this particular review was ultimately classified as category (1), because they were pushed higher by all the factors shown in red

The red colour means that each feature pushed the prediction probability higher, whereas blue would have pushed the probability lower. Here we can see how words such as “wonderful” contributed to a higher good review prediction probability. Besides, words such us "table" have a negative impact on the prediction.

In [139]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[2760,:], 
                colour_test.iloc[2760,:], feature_names=feature_names)
Out[139]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

In this case, we have an example of another review prediction where the output value was -1.98.

Here we can see how words such as “good” contributed to a higher good review prediction probability. Besides, words such us "bad", "service", "star" have had a bigger negative impact on the prediction pushing the prediction to a negative -1.98.